## Warning: package 'e1071' was built under R version 4.0.5
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.1     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Warning: package 'plotly' was built under R version 4.0.4
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.0.4
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.0.5
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Load the per-member vote counts on Republican-introduced bills.
house_votes_Rep <- read_csv(
  file = "~/Fall21/introDS/DS-3001-New/data/house_votes_Rep.csv"
)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Last.Name = col_character(),
##   party.labels = col_character(),
##   aye = col_double(),
##   nay = col_double(),
##   other = col_double()
## )
# Count how many members carry each party label.
table(house_votes_Rep[["party.labels"]])
## 
##   Democrat Republican 
##        198        229
#View(house_votes_Rep)

Goal: Know how to make decisions and answer questions using clustering.

Repeat the clustering process only using the Rep house votes dataset - What differences and similarities did you see between how the clustering worked for the datasets?

The main difference is that now the Republican cluster is the one with more aye votes and fewer nay votes, whereas in the Dem house votes dataset the Democratic cluster is the one with more aye votes and fewer nay votes.

# Keep only the three vote-count columns as the clustering features.
clust_data_Rep <- house_votes_Rep[, c("aye", "nay", "other")]

# Fit k-means with two centers; the seed is set first so the run is repeatable.
set.seed(1)
kmeans_obj_Rep <- kmeans(
  clust_data_Rep,
  centers = 2,
  algorithm = "Lloyd"
)

# Print the fitted object.
kmeans_obj_Rep
## K-means clustering with 2 clusters of sizes 225, 202
## 
## Cluster means:
##         aye      nay     other
## 1 122.56889 106.9956  90.43556
## 2  70.32673 145.6337 104.03960
## 
## Clustering vector:
##   [1] 2 2 2 2 1 1 1 2 1 2 1 1 1 1 2 1 2 2 1 1 1 1 2 2 1 2 1 2 1 2 2 2 2 1 1 2 1
##  [38] 1 2 1 2 2 2 2 2 1 2 2 1 2 2 2 2 2 1 1 2 1 2 2 2 2 2 1 1 1 1 2 2 1 2 2 2 2
##  [75] 2 1 1 2 1 1 1 2 1 1 1 1 2 2 2 2 1 1 1 2 2 2 1 2 2 2 2 1 1 2 1 2 1 1 1 1 1
## [112] 1 2 2 2 1 1 2 1 1 2 2 1 1 1 1 1 1 2 2 2 2 2 1 2 1 2 1 1 2 1 2 2 2 2 2 1 1
## [149] 2 2 1 1 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 1 1 2 2 1 2 2 2 2 1 1 1 1 2 1 2 2
## [186] 1 1 1 1 2 1 2 1 2 2 2 1 1 2 2 1 2 2 2 1 2 1 1 1 2 1 1 1 1 2 2 1 1 2 1 2 1
## [223] 2 2 2 1 1 1 1 1 1 2 2 1 1 1 2 2 1 2 2 2 2 1 1 1 1 1 2 2 2 2 1 1 2 2 2 1 1
## [260] 1 2 1 2 1 2 1 2 1 2 1 1 1 1 2 1 1 1 2 2 2 2 2 2 2 1 1 2 2 1 1 1 1 1 2 2 2
## [297] 2 1 2 1 1 1 1 2 1 2 2 1 2 2 1 1 1 2 2 1 2 1 1 2 1 1 2 2 1 1 2 2 1 1 1 2 1
## [334] 2 1 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 1 2 1 1 1 1 2 1 2 1 1 2 2 2 2
## [371] 1 1 1 2 1 2 2 2 1 2 1 1 2 1 1 2 1 2 1 1 1 1 1 2 1 2 2 1 2 2 1 1 1 1 1 2 1
## [408] 1 2 2 1 2 1 1 1 1 1 2 1 2 1 2 1 2 2 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 43093.49 77671.01
##  (between_SS / total_SS =  79.5 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Visualize the output: aye vs. nay votes, colored by party and drawn with the
# k-means cluster assignment as the point shape.
party_clusters_Rep <- as.factor(kmeans_obj_Rep$cluster)
ggplot(house_votes_Rep, aes(x = aye,
                            y = nay,
                            color = party.labels,  #<- tell R how to color
                            #   the data points
                            shape = party_clusters_Rep)) +
  geom_point(size = 6) +
  ggtitle("Aye vs. Nay votes for Republican-introduced bills") +
  xlab("Number of Aye Votes") +
  ylab("Number of Nay Votes") +
  # The shape values are the characters "1" and "2", one per cluster level.
  scale_shape_manual(name = "Cluster",
                     labels = c("Cluster 1", "Cluster 2"),
                     values = c("1", "2")) +
  scale_color_manual(name = "Party",         #<- tell R which colors to use and
                     #   which labels to include in the legend
                     labels = c("Democratic", "Republican"),
                     values = c("blue", "red")) +
  theme_light()

# Save the plot as a png.
ggsave("US House Votes for Rep Bills.png",
       width = 10,
       height = 5.62,
       units = "in")
# Evaluate the quality of the clustering

# Between-cluster sum of squares: the "betweenss" component of the kmeans fit.
num_Rep <- kmeans_obj_Rep$betweenss

# Total sum of squares: the "totss" component of the kmeans fit.
denom_Rep <- kmeans_obj_Rep$totss

# Share of the total sum of squares accounted for by the clusters.
# The surrounding parentheses print the value as well as storing it.
(var_exp_Rep <- num_Rep / denom_Rep)
## [1] 0.7952692
# Use the function we created to evaluate several different numbers of clusters

# explained_variance() fits k-means to `data_in` with `k` centers and returns
# the ratio betweenss / totss from the fitted object.
#
# Arguments:
#   data_in    data passed as the first argument to kmeans().
#   k          number of centers requested from kmeans().
#   seed       value passed to set.seed() before fitting so repeated calls
#              give the same fit; use NULL to leave the RNG state untouched.
#              Defaults to 1, the value that was previously hard-coded.
#   iter_max   forwarded to kmeans() as iter.max (default 30, as before).
#   algorithm  forwarded to kmeans() as algorithm (default "Lloyd", as before).
#
# Returns: a single number, betweenss / totss.
explained_variance <- function(data_in, k, seed = 1, iter_max = 30,
                               algorithm = "Lloyd") {
  # Reseed only when asked to, so callers can opt out of the global side effect.
  if (!is.null(seed)) {
    set.seed(seed)
  }

  fit <- kmeans(data_in, centers = k, algorithm = algorithm,
                iter.max = iter_max)

  # Variance accounted for by clusters:
  # between-cluster sum of squares / total sum of squares.
  fit$betweenss / fit$totss
}
# Explained-variance ratio for k = 1..10. vapply() guarantees one numeric value
# per k, whereas sapply() would silently change its return type on odd input.
explained_var_Rep <- vapply(1:10, explained_variance,
                            FUN.VALUE = numeric(1),
                            data_in = clust_data_Rep)
#View(explained_var_Rep)
# Create an elbow chart of the output

# Pair each candidate k with its explained-variance ratio for ggplot2.
elbow_data_Rep <- data.frame(k = 1:10, explained_var_Rep)
#View(elbow_data_Rep)

# Draw the elbow curve: one point per k, joined by a line.
ggplot(elbow_data_Rep, aes(x = k, y = explained_var_Rep)) +
  geom_point(size = 4) +           #<- sets the size of the data points
  geom_line(size = 1) +            #<- sets the thickness of the line
  labs(x = 'k',
       y = 'Inter-cluster Variance / Total Variance') +
  theme_light()

# Use NbClust to select a number of clusters
# Run NbClust with k-means on the clustering features and keep the result.
nbclust_obj_Rep <- NbClust(data = clust_data_Rep, method = "kmeans")
# Print the result immediately after fitting.
nbclust_obj_Rep
## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## Warning in log(det(P)/det(W)): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 12 proposed 2 as the best number of clusters 
## * 4 proposed 3 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 2 proposed 7 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 1 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
## $All.index
##          KL        CH Hartigan     CCC    Scott   Marriot     TrCovW    TraceW
## 2  131.0817 1650.8972 140.8739 76.7950      NaN -534.8540 4148081409 120764.50
## 3    0.8084 1166.7437 100.0769 73.2048      NaN -299.2478 2092420152  90700.26
## 4    0.7187  992.4333  54.3510 68.7774 16304.01  253.9222 1364506819  73380.28
## 5    0.3692  851.5327  92.8867 65.0656      NaN   -4.0054 1089970216  65025.23
## 6    0.6653  847.7351 109.7583 64.6344 16862.81  154.3630  711907875  53294.53
## 7   11.4461  906.7565  44.6891 65.8317 17305.83   74.4479  448521362  42273.48
## 8    0.5271  864.2396  44.9357 64.7719      NaN  -36.2018  365507694  38208.04
## 9    2.2064  840.9147  25.6650 64.1975 17740.77   44.4388  300834537  34507.30
## 10   0.4156  794.3214  38.9806 63.0588      NaN   -3.9473  274181075  32511.13
## 11   0.7418  783.7302  40.9588 62.8420      NaN  -65.7978  223035146  29731.84
## 12   0.4787  784.4653  58.9331 62.9414 17615.49  105.9411  184024352  27066.87
## 13   1.2365  824.1301  52.1301 64.0516 18438.62   18.0883  142963370  23701.14
## 14   2.4682  858.4573  32.3669 65.0020      NaN   -3.3429  111468031  21050.50
## 15   0.4565  859.8358  48.2537 65.1549 18991.72    6.5938   96034586  19520.66
##         Friedman    Rubin Cindex     DB Silhouette   Duda  Pseudot2   Beale
## 2  -1.196478e+14 127.4114 0.1605 0.5038     0.7149 1.1814  -31.0231 -0.2603
## 3  -3.098874e+14 169.6442 0.2177 0.7555     0.6854 4.2585 -152.2704 -1.2096
## 4   4.270813e+14 209.6854 0.1694 1.1057     0.4545 0.9574    7.2442  0.0717
## 5  -3.264814e+16 236.6277 0.1605 1.0231     0.4588 1.7872  -52.8556 -0.7360
## 6   8.442332e+14 288.7120 0.1330 1.1476     0.3324 1.7658  -32.9601 -0.7271
## 7   1.498087e+15 363.9817 0.1906 1.0181     0.3467 2.8985  -89.0786 -0.9757
## 8  -3.296079e+15 402.7103 0.1807 1.0080     0.3510 0.9977    0.2089  0.0039
## 9   2.747395e+15 445.8991 0.1743 0.9032     0.3627 0.5647   19.2747  1.2963
## 10 -3.297302e+16 473.2770 0.1938 1.0032     0.3274 6.8536 -116.1566 -1.3908
## 11 -2.061187e+15 517.5183 0.1906 0.9419     0.3232 0.7118   15.7916  0.6801
## 12  1.268548e+15 568.4725 0.1883 0.9347     0.3356 2.3591  -33.4148 -0.9416
## 13  6.598389e+15 649.1998 0.1706 0.9088     0.3436 2.0965  -11.5065 -0.8533
## 14 -3.306221e+16 730.9457 0.1539 0.8893     0.3603 6.8406  -61.4746 -1.3566
## 15  1.651659e+16 788.2302 0.2181 0.9026     0.3542 0.7263   24.1153  0.6289
##    Ratkowsky      Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex  Dindex
## 2     0.5589 60382.252     0.8806  0.5216  0.2779 0.1005      0  0.0763 13.4946
## 3     0.5020 30233.420     0.8967  5.6954  0.2853 0.0782      0  0.0934 12.5274
## 4     0.4533 18345.069     0.6993  2.3087  0.5622 0.0156      0  0.1517 11.0551
## 5     0.4101 13005.045     0.6586  1.5951  0.6556 0.0156      0  0.1620 10.2381
## 6     0.3805  8882.422     0.5582  1.0498  0.9561 0.0156      0  0.1535  9.2055
## 7     0.3580  6039.068     0.5250  1.4422  1.0747 0.0245      0  0.1471  8.4672
## 8     0.3360  4776.005     0.4959  0.8232  1.2010 0.0245      0  0.1573  7.8994
## 9     0.3192  3834.144     0.4855  2.2024  1.2426 0.0245      0  0.1627  7.5574
## 10    0.3019  3251.113     0.4598  2.5479  1.3859 0.0280      0  0.1973  7.2851
## 11    0.2896  2702.895     0.4418  3.0898  1.5030 0.0280      0  0.2265  7.0407
## 12    0.2791  2255.573     0.4241  0.3815  1.6339 0.0280      0  0.2725  6.6572
## 13    0.2688  1823.164     0.4117  0.1275  1.6649 0.0280      0  0.2661  6.3009
## 14    0.2603  1503.607     0.4107 -7.8443  1.5852 0.0280      0  0.2114  5.9222
## 15    0.2516  1301.377     0.3854  0.4523  1.8242 0.0394      0  0.2505  5.7283
##      SDbw
## 2  0.2255
## 3  0.2677
## 4  0.4023
## 5  0.3697
## 6  0.4053
## 7  0.3781
## 8  0.3099
## 9  0.3628
## 10 0.2101
## 11 0.2945
## 12 0.2095
## 13 0.2059
## 14 0.1430
## 15 0.1779
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.6390           114.1248       1.0000
## 3          0.2115           742.0166       1.0000
## 4          0.2887           401.6280       0.9749
## 5          0.4868           126.4976       1.0000
## 6          0.5151            71.5437       1.0000
## 7          0.0438          2971.3276       1.0000
## 8          0.5066            86.6886       0.9997
## 9          0.5413            21.1850       0.2763
## 10         0.3322           273.4259       1.0000
## 11         0.5318            34.3418       0.5650
## 12         0.3500           107.6921       1.0000
## 13         0.3414            42.4447       1.0000
## 14         0.2298           241.3514       1.0000
## 15         0.4783            69.8184       0.5974
## 
## $Best.nc
##                       KL       CH Hartigan    CCC    Scott  Marriot     TrCovW
## Number_clusters   2.0000    2.000   7.0000  2.000  13.0000   5.0000          3
## Value_Index     131.0817 1650.897  65.0692 76.795 823.1304 416.2959 2055661257
##                   TraceW    Friedman    Rubin Cindex     DB Silhouette   Duda
## Number_clusters     3.00 1.50000e+01   7.0000  6.000 2.0000     2.0000 2.0000
## Value_Index     12744.26 4.95788e+16 -36.5411  0.133 0.5038     0.7149 1.1814
##                 PseudoT2   Beale Ratkowsky     Ball PtBiserial Frey McClain
## Number_clusters   2.0000  2.0000    2.0000     3.00     3.0000    1  2.0000
## Value_Index     -31.0231 -0.2603    0.5589 30148.83     0.8967   NA  0.2779
##                   Dunn Hubert SDindex Dindex   SDbw
## Number_clusters 2.0000      0  2.0000      0 14.000
## Value_Index     0.1005      0  0.0763      0  0.143
## 
## $Best.partition
##   [1] 2 2 2 2 1 1 1 2 1 2 1 1 1 1 2 1 2 2 1 1 1 1 2 2 1 2 1 2 1 2 2 2 2 1 1 2 1
##  [38] 1 2 1 2 2 2 2 2 1 2 2 1 2 2 2 2 2 1 1 2 1 2 2 2 2 2 1 1 1 1 2 2 1 2 2 2 2
##  [75] 2 1 1 2 1 1 1 2 1 1 1 1 2 2 2 2 1 1 1 2 2 2 1 2 2 2 2 1 1 2 1 2 1 1 1 1 1
## [112] 1 2 2 2 1 1 2 1 1 2 2 1 1 1 1 1 1 2 2 2 2 2 1 2 1 2 1 1 2 1 2 2 2 2 2 1 1
## [149] 2 2 1 1 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 1 1 2 2 1 2 2 2 2 1 1 1 1 2 1 2 2
## [186] 1 1 1 1 2 1 2 1 2 2 2 1 1 2 2 1 2 2 2 1 2 1 1 1 2 1 1 1 1 2 2 1 1 2 1 2 1
## [223] 2 2 2 1 1 1 1 1 1 2 2 1 1 1 2 2 1 2 2 2 2 1 1 1 1 1 2 2 2 2 1 1 2 2 2 1 1
## [260] 1 2 1 2 1 2 1 2 1 2 1 1 1 1 2 1 1 1 2 2 2 2 2 2 2 1 1 2 2 1 1 1 1 1 2 2 2
## [297] 2 1 2 1 1 1 1 2 1 2 2 1 2 2 1 1 1 2 2 1 2 1 1 2 1 1 2 2 1 1 2 2 1 1 1 2 1
## [334] 2 1 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 1 2 1 1 1 1 2 1 2 1 1 2 2 2 2
## [371] 1 1 1 2 1 2 2 2 1 2 1 1 2 1 1 2 1 2 1 1 1 1 1 2 1 2 2 1 2 2 1 1 1 1 1 2 1
## [408] 1 2 2 1 2 1 1 1 1 1 2 1 2 1 2 1 2 2 1 2
# Print the stored NbClust result again.
print(nbclust_obj_Rep)
## $All.index
##          KL        CH Hartigan     CCC    Scott   Marriot     TrCovW    TraceW
## 2  131.0817 1650.8972 140.8739 76.7950      NaN -534.8540 4148081409 120764.50
## 3    0.8084 1166.7437 100.0769 73.2048      NaN -299.2478 2092420152  90700.26
## 4    0.7187  992.4333  54.3510 68.7774 16304.01  253.9222 1364506819  73380.28
## 5    0.3692  851.5327  92.8867 65.0656      NaN   -4.0054 1089970216  65025.23
## 6    0.6653  847.7351 109.7583 64.6344 16862.81  154.3630  711907875  53294.53
## 7   11.4461  906.7565  44.6891 65.8317 17305.83   74.4479  448521362  42273.48
## 8    0.5271  864.2396  44.9357 64.7719      NaN  -36.2018  365507694  38208.04
## 9    2.2064  840.9147  25.6650 64.1975 17740.77   44.4388  300834537  34507.30
## 10   0.4156  794.3214  38.9806 63.0588      NaN   -3.9473  274181075  32511.13
## 11   0.7418  783.7302  40.9588 62.8420      NaN  -65.7978  223035146  29731.84
## 12   0.4787  784.4653  58.9331 62.9414 17615.49  105.9411  184024352  27066.87
## 13   1.2365  824.1301  52.1301 64.0516 18438.62   18.0883  142963370  23701.14
## 14   2.4682  858.4573  32.3669 65.0020      NaN   -3.3429  111468031  21050.50
## 15   0.4565  859.8358  48.2537 65.1549 18991.72    6.5938   96034586  19520.66
##         Friedman    Rubin Cindex     DB Silhouette   Duda  Pseudot2   Beale
## 2  -1.196478e+14 127.4114 0.1605 0.5038     0.7149 1.1814  -31.0231 -0.2603
## 3  -3.098874e+14 169.6442 0.2177 0.7555     0.6854 4.2585 -152.2704 -1.2096
## 4   4.270813e+14 209.6854 0.1694 1.1057     0.4545 0.9574    7.2442  0.0717
## 5  -3.264814e+16 236.6277 0.1605 1.0231     0.4588 1.7872  -52.8556 -0.7360
## 6   8.442332e+14 288.7120 0.1330 1.1476     0.3324 1.7658  -32.9601 -0.7271
## 7   1.498087e+15 363.9817 0.1906 1.0181     0.3467 2.8985  -89.0786 -0.9757
## 8  -3.296079e+15 402.7103 0.1807 1.0080     0.3510 0.9977    0.2089  0.0039
## 9   2.747395e+15 445.8991 0.1743 0.9032     0.3627 0.5647   19.2747  1.2963
## 10 -3.297302e+16 473.2770 0.1938 1.0032     0.3274 6.8536 -116.1566 -1.3908
## 11 -2.061187e+15 517.5183 0.1906 0.9419     0.3232 0.7118   15.7916  0.6801
## 12  1.268548e+15 568.4725 0.1883 0.9347     0.3356 2.3591  -33.4148 -0.9416
## 13  6.598389e+15 649.1998 0.1706 0.9088     0.3436 2.0965  -11.5065 -0.8533
## 14 -3.306221e+16 730.9457 0.1539 0.8893     0.3603 6.8406  -61.4746 -1.3566
## 15  1.651659e+16 788.2302 0.2181 0.9026     0.3542 0.7263   24.1153  0.6289
##    Ratkowsky      Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex  Dindex
## 2     0.5589 60382.252     0.8806  0.5216  0.2779 0.1005      0  0.0763 13.4946
## 3     0.5020 30233.420     0.8967  5.6954  0.2853 0.0782      0  0.0934 12.5274
## 4     0.4533 18345.069     0.6993  2.3087  0.5622 0.0156      0  0.1517 11.0551
## 5     0.4101 13005.045     0.6586  1.5951  0.6556 0.0156      0  0.1620 10.2381
## 6     0.3805  8882.422     0.5582  1.0498  0.9561 0.0156      0  0.1535  9.2055
## 7     0.3580  6039.068     0.5250  1.4422  1.0747 0.0245      0  0.1471  8.4672
## 8     0.3360  4776.005     0.4959  0.8232  1.2010 0.0245      0  0.1573  7.8994
## 9     0.3192  3834.144     0.4855  2.2024  1.2426 0.0245      0  0.1627  7.5574
## 10    0.3019  3251.113     0.4598  2.5479  1.3859 0.0280      0  0.1973  7.2851
## 11    0.2896  2702.895     0.4418  3.0898  1.5030 0.0280      0  0.2265  7.0407
## 12    0.2791  2255.573     0.4241  0.3815  1.6339 0.0280      0  0.2725  6.6572
## 13    0.2688  1823.164     0.4117  0.1275  1.6649 0.0280      0  0.2661  6.3009
## 14    0.2603  1503.607     0.4107 -7.8443  1.5852 0.0280      0  0.2114  5.9222
## 15    0.2516  1301.377     0.3854  0.4523  1.8242 0.0394      0  0.2505  5.7283
##      SDbw
## 2  0.2255
## 3  0.2677
## 4  0.4023
## 5  0.3697
## 6  0.4053
## 7  0.3781
## 8  0.3099
## 9  0.3628
## 10 0.2101
## 11 0.2945
## 12 0.2095
## 13 0.2059
## 14 0.1430
## 15 0.1779
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.6390           114.1248       1.0000
## 3          0.2115           742.0166       1.0000
## 4          0.2887           401.6280       0.9749
## 5          0.4868           126.4976       1.0000
## 6          0.5151            71.5437       1.0000
## 7          0.0438          2971.3276       1.0000
## 8          0.5066            86.6886       0.9997
## 9          0.5413            21.1850       0.2763
## 10         0.3322           273.4259       1.0000
## 11         0.5318            34.3418       0.5650
## 12         0.3500           107.6921       1.0000
## 13         0.3414            42.4447       1.0000
## 14         0.2298           241.3514       1.0000
## 15         0.4783            69.8184       0.5974
## 
## $Best.nc
##                       KL       CH Hartigan    CCC    Scott  Marriot     TrCovW
## Number_clusters   2.0000    2.000   7.0000  2.000  13.0000   5.0000          3
## Value_Index     131.0817 1650.897  65.0692 76.795 823.1304 416.2959 2055661257
##                   TraceW    Friedman    Rubin Cindex     DB Silhouette   Duda
## Number_clusters     3.00 1.50000e+01   7.0000  6.000 2.0000     2.0000 2.0000
## Value_Index     12744.26 4.95788e+16 -36.5411  0.133 0.5038     0.7149 1.1814
##                 PseudoT2   Beale Ratkowsky     Ball PtBiserial Frey McClain
## Number_clusters   2.0000  2.0000    2.0000     3.00     3.0000    1  2.0000
## Value_Index     -31.0231 -0.2603    0.5589 30148.83     0.8967   NA  0.2779
##                   Dunn Hubert SDindex Dindex   SDbw
## Number_clusters 2.0000      0  2.0000      0 14.000
## Value_Index     0.1005      0  0.0763      0  0.143
## 
## $Best.partition
##   [1] 2 2 2 2 1 1 1 2 1 2 1 1 1 1 2 1 2 2 1 1 1 1 2 2 1 2 1 2 1 2 2 2 2 1 1 2 1
##  [38] 1 2 1 2 2 2 2 2 1 2 2 1 2 2 2 2 2 1 1 2 1 2 2 2 2 2 1 1 1 1 2 2 1 2 2 2 2
##  [75] 2 1 1 2 1 1 1 2 1 1 1 1 2 2 2 2 1 1 1 2 2 2 1 2 2 2 2 1 1 2 1 2 1 1 1 1 1
## [112] 1 2 2 2 1 1 2 1 1 2 2 1 1 1 1 1 1 2 2 2 2 2 1 2 1 2 1 1 2 1 2 2 2 2 2 1 1
## [149] 2 2 1 1 1 2 1 2 1 2 1 2 1 1 2 1 2 1 2 1 1 1 2 2 1 2 2 2 2 1 1 1 1 2 1 2 2
## [186] 1 1 1 1 2 1 2 1 2 2 2 1 1 2 2 1 2 2 2 1 2 1 1 1 2 1 1 1 1 2 2 1 1 2 1 2 1
## [223] 2 2 2 1 1 1 1 1 1 2 2 1 1 1 2 2 1 2 2 2 2 1 1 1 1 1 2 2 2 2 1 1 2 2 2 1 1
## [260] 1 2 1 2 1 2 1 2 1 2 1 1 1 1 2 1 1 1 2 2 2 2 2 2 2 1 1 2 2 1 1 1 1 1 2 2 2
## [297] 2 1 2 1 1 1 1 2 1 2 2 1 2 2 1 1 1 2 2 1 2 1 1 2 1 1 2 2 1 1 2 2 1 1 1 2 1
## [334] 2 1 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 1 2 1 1 1 1 2 1 2 1 1 2 2 2 2
## [371] 1 1 1 2 1 2 2 2 1 2 1 1 2 1 1 2 1 2 1 1 1 1 1 2 1 2 2 1 2 2 1 1 1 1 1 2 1
## [408] 1 2 2 1 2 1 1 1 1 1 2 1 2 1 2 1 2 2 1 2
# View the output that shows the number of clusters each method recommends.
#View(nbclust_obj_Rep$Best.nc)
# Display the results visually
# Take the first row of Best.nc (the number of clusters chosen by each index)
# and store it as a one-column data frame named after the object.
freq_k_Rep <- data.frame(freq_k_Rep = nbclust_obj_Rep$Best.nc[1, ])
#View(freq_k_Rep)

# Check the maximum number of clusters suggested.
max(freq_k_Rep)
#essentially resets the plot viewer back to default
#dev.off()

# Bar chart: how many indices voted for each number of clusters.
ggplot(freq_k_Rep, aes(x = freq_k_Rep)) +
  geom_bar() +
  scale_x_continuous(breaks = seq(0, 15, by = 1)) +
  scale_y_continuous(breaks = seq(0, 12, by = 1)) +
  xlab("Number of Clusters") +
  ylab("Number of Votes") +
  ggtitle("Cluster Analysis")

# Using the recommended number of clusters, compare the quality of the model
# with 2 clusters.
# Both the elbow graph and the NbClust method recommend two clusters.
# Bonus: Create a 3d version of the output

# Lookup table mapping each party label to a display color.
party_color3D_Rep <- data.frame(party.labels = c("Democrat", "Republican"),
                                color = c("blue", "red"))

# View() needs an interactive viewer, so it stays commented out like the other
# View() calls in this script.
#View(party_color3D_Rep)


# Join the color lookup onto the house_votes_Rep data set. The key is named
# explicitly so the join does not depend on guessing the shared column.
house_votes_color_Rep <- inner_join(house_votes_Rep, party_color3D_Rep,
                                    by = "party.labels")
## Joining, by = "party.labels"
# Attach the k-means cluster factor as a new column.
house_votes_color_Rep[["clusters"]] <- party_clusters_Rep

# Inspect the structure of the combined data.
str(house_votes_color_Rep)
## spec_tbl_df[,7] [427 x 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Last.Name   : chr [1:427] "Courtney" "Lewis" "Bera" "McCollum" ...
##  $ party.labels: chr [1:427] "Democrat" "Democrat" "Democrat" "Democrat" ...
##  $ aye         : num [1:427] 66 59 84 74 127 125 125 67 97 68 ...
##  $ nay         : num [1:427] 163 145 141 154 103 91 95 142 99 147 ...
##  $ other       : num [1:427] 91 116 95 92 90 104 100 111 124 105 ...
##  $ color       : chr [1:427] "blue" "blue" "blue" "blue" ...
##  $ clusters    : Factor w/ 2 levels "1","2": 2 2 2 2 1 1 1 2 1 2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Last.Name = col_character(),
##   ..   party.labels = col_character(),
##   ..   aye = col_double(),
##   ..   nay = col_double(),
##   ..   other = col_double()
##   .. )
# Strip every non-alphanumeric character from the member names.
house_votes_color_Rep$Last.Name <- gsub(
  pattern = "[^[:alnum:]]",
  replacement = "",
  x = house_votes_color_Rep$Last.Name
)

# Build the 3D scatter with plotly: one axis per vote count, colored by the
# joined color column, with the marker symbol taken from the cluster column.
fig <- plot_ly(
  data = house_votes_color_Rep,
  x = ~aye,
  y = ~nay,
  z = ~other,
  type = "scatter3d",
  mode = "markers",
  color = ~color,
  colors = c("#0C4B8E", "#BF382A"),
  symbol = ~clusters,
  # Hover text shows the member's name and party.
  text = ~paste("Representative:", Last.Name,
                "Party:", party.labels)
)


fig
# dev.off()

# The 3D plot is flat, so the "other" category isn't impacting the clusters very much.